TL;DR

Given the results of the analysis of the Fluidigm data, here I explore the role of the V intercept in a more complex data (Allen).

We want to check if the intercept acts as a size factor here too and, if true, if W capture some interesting biology.

Interestingly, in this more complex dataset, both intercepts are needed to get a clear picture of the data in two dimensions.

Data

As a first pass, I will only focus on the cells that passed the QC filters (by the original authors) and that were part of the “core” clusters. I will color-code the cells by either known cell type or by inferred cluster (inferred in the original study).

Select cells, remove ERCC spike-ins, filter out the genes that do not have at least 10 counts in at least 10 cells.

data("allen")
allen_core <- allen[grep("^ERCC-", rownames(allen), invert = TRUE),
                    which(colData(allen)$Core.Type=="Core")]

filter <- apply(assay(allen_core)>10, 1, sum)>=10

Number of retained genes:

print(sum(filter))
## [1] 11545

To speed up the computations, I will focus on the top 1,000 most variable genes.

allen_core <- allen_core[filter,]
core <- assay(allen_core)

vars <- rowVars(log1p(core))
names(vars) <- rownames(core)
vars <- sort(vars, decreasing = TRUE)
core <- core[names(vars)[1:1000],]

First, let’s look at PCA (of the log counts) for reference.

par(mfrow=c(1, 2))
bio <- as.factor(colData(allen_core)$driver_1_s)
cl <- as.factor(colData(allen_core)$Primary.Type)

detection_rate <- colSums(core>0)
coverage <- colSums(core)

library(EDASeq)
fq <- betweenLaneNormalization(core, which="full")
pca <- prcomp(t(log1p(fq)))
plot(pca$x, col=cols[bio], pch=19, main="PCA of log-counts, centered not scaled")
legend("bottomleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19, main="PCA of log-counts, centered not scaled")

df <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                      PC1        PC2 detection_rate   coverage
## PC1            1.0000000  0.2269928      0.2543772  0.0875427
## PC2            0.2269928  1.0000000     -0.4918824 -0.1161663
## detection_rate 0.2543772 -0.4918824      1.0000000  0.5275845
## coverage       0.0875427 -0.1161663      0.5275845  1.0000000

V intercept in both Mu and Pi

print(system.time(zinb_Vall <- zinbFit(core, ncores = 3, K = 2)))
##    user  system elapsed 
## 103.392  27.817  55.820

Plot the results with cells colored according to their biological condition.

par(mfrow=c(1, 2))
plot(zinb_Vall@W, col=cols[bio], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottomright", levels(bio), fill=cols)

plot(zinb_Vall@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("topright", levels(cl), fill=cols2)

Explore Gamma estimates

One interpretation of the model is that \(\gamma_mu\) will act as a “size factor”. Here, we check whether it captures sequencing depth and/or detection rate.

#total number of detected genes in the cell
df <- data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2], gamma_mu = zinb_Vall@gamma_mu[1,], gamma_pi = zinb_Vall@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         W1          W2    gamma_mu   gamma_pi
## W1              1.00000000  0.03507182 -0.08456147  0.3600626
## W2              0.03507182  1.00000000 -0.36770620  0.3713183
## gamma_mu       -0.08456147 -0.36770620  1.00000000 -0.3353419
## gamma_pi        0.36006262  0.37131830 -0.33534190  1.0000000
## detection_rate -0.35181563 -0.34016219  0.37149947 -0.9939488
## coverage       -0.05237705 -0.05644585  0.85040460 -0.4845843
##                detection_rate    coverage
## W1                 -0.3518156 -0.05237705
## W2                 -0.3401622 -0.05644585
## gamma_mu            0.3714995  0.85040460
## gamma_pi           -0.9939488 -0.48458428
## detection_rate      1.0000000  0.52758451
## coverage            0.5275845  1.00000000

\(\gamma_pi\) clearly captures the detection rate and \(\gamma_mu\) clearly captures the coverage. Interestingly, the two estimates are positively correlated.

par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vall)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_Vall))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(2, 2))
boxplot(zinb_Vall@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vall@beta_pi[1,], main="Beta_Pi")

plot(t(zinb_Vall@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vall@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

No V intercept

print(system.time(zinb_Vnone <- zinbFit(core, ncores = 3, K = 2, V=matrix(0, ncol=1, nrow=NROW(core)))))
##    user  system elapsed 
## 121.594  30.205  60.748
par(mfrow=c(1, 2))
plot(zinb_Vnone@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_Vnone@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")

df <- data.frame(W1 = zinb_Vnone@W[,1], W2 = zinb_Vnone@W[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                          W1          W2 detection_rate   coverage
## W1              1.000000000 0.005234128     -0.4111624 -0.1333461
## W2              0.005234128 1.000000000      0.8734817  0.5863095
## detection_rate -0.411162366 0.873481664      1.0000000  0.5275845
## coverage       -0.133346120 0.586309465      0.5275845  1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vnone)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_Vnone))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(2, 2))
boxplot(zinb_Vnone@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vnone@beta_pi[1,], main="Beta_Pi")

plot(t(zinb_Vnone@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vnone@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

V intercept only in Mu

V <- cbind(rep(0, NROW(core)), rep(1, NROW(core)))

print(system.time(zinb_Vmu <- zinbFit(core, ncores = 3, K = 2, V=V, which_V_mu=2L, which_V_pi=1L)))
##    user  system elapsed 
## 121.534  32.194  63.892
par(mfrow=c(1, 2))
plot(zinb_Vmu@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
plot(zinb_Vmu@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")

df <- data.frame(W1 = zinb_Vmu@W[,1], W2 = zinb_Vmu@W[,2], gamma_mu = zinb_Vmu@gamma_mu[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         W1          W2  gamma_mu detection_rate   coverage
## W1              1.00000000 -0.05849243 0.0479402     -0.2684243 -0.1091023
## W2             -0.05849243  1.00000000 0.5594919      0.8888512  0.3763285
## gamma_mu        0.04794020  0.55949188 1.0000000      0.5882734  0.9234284
## detection_rate -0.26842434  0.88885119 0.5882734      1.0000000  0.5275845
## coverage       -0.10910231  0.37632849 0.9234284      0.5275845  1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vmu)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_Vmu))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(2, 2))
boxplot(zinb_Vmu@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vmu@beta_pi[1,], main="Beta_Pi")

plot(t(zinb_Vmu@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vmu@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

V intercept only in Pi

print(system.time(zinb_Vpi <- zinbFit(core, ncores = 3, K = 2, V=V, which_V_mu=1L, which_V_pi=2L)))
##    user  system elapsed 
## 164.088  44.875  87.087
par(mfrow=c(1, 2))
plot(zinb_Vpi@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
plot(zinb_Vpi@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")

df <- data.frame(W1 = zinb_Vpi@W[,1], W2 = zinb_Vpi@W[,2], gamma_pi = zinb_Vpi@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         W1         W2    gamma_pi detection_rate
## W1              1.00000000  0.1300077  0.09476535    -0.09798919
## W2              0.13000772  1.0000000 -0.37655658     0.27521553
## gamma_pi        0.09476535 -0.3765566  1.00000000    -0.98666386
## detection_rate -0.09798919  0.2752155 -0.98666386     1.00000000
## coverage        0.15573929 -0.2577952 -0.47891212     0.52758451
##                  coverage
## W1              0.1557393
## W2             -0.2577952
## gamma_pi       -0.4789121
## detection_rate  0.5275845
## coverage        1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vpi)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_Vpi))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(2, 2))
boxplot(zinb_Vpi@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vpi@beta_pi[1,], main="Beta_Pi")

plot(t(zinb_Vpi@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vpi@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

Do we need more than two dimensions?

print(system.time(zinb_3 <- zinbFit(core, ncores = 3, K = 3)))
##    user  system elapsed 
## 125.764  30.668  63.766
pairs(zinb_3@W, col=cols[bio], pch=19)

pairs(zinb_3@W, col=cols2[cl], pch=19)

## write matrices to file to feed to ZIFA in python
write.csv(log1p(core), file="logcounts_allen.csv")

Using all genes

core <- assay(allen_core)
dim(core)
## [1] 11545   285
print(system.time(zinb_all <- zinbFit(core, ncores = 3, K = 2)))
##     user   system  elapsed 
## 1147.669  169.697  568.605
par(mfrow=c(1, 2))
plot(zinb_all@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
plot(zinb_all@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")

Interestingly, when using all the genes, things don’t work nicely anymore. I wonder if the difference is the selection of the most variable genes, rather than the complexity of the data.

ZIFA (top 1000 most variable genes)

Full algorithm

par(mfrow=c(1, 2))
zifa_res <- read.csv("zifa_allen_full.csv", header=FALSE)

plot(zifa_res, col=cols[bio], pch=19, xlab="Z1", ylab="Z2")
plot(zifa_res, col=cols2[cl], pch=19, xlab="Z1", ylab="Z2")

Block algorithm

par(mfrow=c(1, 2))
zifa_res <- read.csv("zifa_allen.csv", header=FALSE)

plot(zifa_res, col=cols[bio], pch=19, xlab="Z1", ylab="Z2")
plot(zifa_res, col=cols2[cl], pch=19, xlab="Z1", ylab="Z2")